# read data
student <- read.csv('/Users/feiyasuo/Documents/GitHub/consulting-project-pandemic-survey/student.csv')
faculty <- read.csv('/Users/feiyasuo/Documents/GitHub/consulting-project-pandemic-survey/faculty.csv')

Student Survey

1. Bar graph with 95% confidence interval

In this section, I analyze multiple-choice questions in the survey. The figures show the count for each option with a 95% confidence interval. If the confidence intervals of any two bars do not overlap, it indicates that the numbers of respondents choosing the two options are significantly different.

Question 1: For your preclinical education, what is your preferred mode of learning?

student_q1 <- data.frame(table(student[student$q1!='',]$q1)[-1])
student_q1$prop <- student_q1$Freq/sum(student_q1$Freq)
student_q1$sd <- sqrt(student_q1$prop*(1-student_q1$prop)*student_q1$Freq)

ggplot(student_q1) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q1: Preferred Mode of Learning") +
  xlab("") +
  coord_flip()

Question 2: What is your preferred method of accessing lecture material?

student_q2 <- data.frame(table(student[student$q2!='',]$q2))
student_q2 <- student_q2[-1,]
student_q2$prop <- student_q2$Freq/sum(student_q2$Freq)
student_q2$sd <- sqrt(student_q2$prop*(1-student_q2$prop)*student_q2$Freq)

ggplot(student_q2) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q2: Preferred Method of Accessing Lecture Material") +
  xlab("") +
  coord_flip()

Question 3: How often will you attend in-person lectures when offered?

student_q3 <- data.frame(table(student[student$q3!='',]$q3))
student_q3 <- student_q3[-1,]

student_q3$prop <- student_q3$Freq/sum(student_q3$Freq)
student_q3$sd <- sqrt(student_q3$prop*(1-student_q3$prop)*student_q3$Freq)

ggplot(student_q3) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q3: How often will you attend in-person lectures") +
  xlab("") +
  coord_flip()

Question 6: How easy was it connecting with other students and faculty in small groups?

student_q6 <- data.frame(table(student[student$q6!='',]$q6))
student_q6 <- student_q6[-1,]
student_q6$Var1 <- factor(student_q6$Var1, levels = c("Very easy", "Easy", 
                                                      "Neither difficult nor easy", "Difficult",
                                                      "Very difficult"))

student_q6$prop <- student_q6$Freq/sum(student_q6$Freq)
student_q6$sd <- sqrt(student_q6$prop*(1-student_q6$prop)*student_q6$Freq)


ggplot(student_q6) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q6: connecting with other students and faculty in small groups") +
  xlab("") +
  coord_flip()

Question 7: How often did you meet with your small group outside of the assigned time to go over cases?

student_q7 <- data.frame(table(student[student$q7!='',]$q7))
student_q7 <- student_q7[-1,]
student_q7$Var1 <- factor(student_q7$Var1, levels = c("Often", "Sometimes", "Rarely", "Never"))

student_q7$prop <- student_q7$Freq/sum(student_q7$Freq)
student_q7$sd <- sqrt(student_q7$prop*(1-student_q7$prop)*student_q7$Freq)

ggplot(student_q7) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q7: How often did you meet with your small group?") +
  xlab("") +
  coord_flip()

Question 8: The virtual curriculum limited your ability to form social connections with your classmates.

student_q8 <- data.frame(table(student[student$q8!='',]$q8))
student_q8 <- student_q8[-1,]
student_q8$Var1 <- factor(student_q8$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q8$prop <- student_q8$Freq/sum(student_q8$Freq)
student_q8$sd <- sqrt(student_q8$prop*(1-student_q8$prop)*student_q8$Freq)

ggplot(student_q8) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q8: The virtual curriculum limited your ability to form social connections") +
  xlab("") +
  coord_flip()

Question 9: The virtual curriculum affected your ability for self care.

student_q9 <- data.frame(table(student[student$q9!='',]$q9))
student_q9 <- student_q9[-1,]
student_q9$Var1 <- factor(student_q9$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q9$prop <- student_q9$Freq/sum(student_q9$Freq)
student_q9$sd <- sqrt(student_q9$prop*(1-student_q9$prop)*student_q9$Freq)

ggplot(student_q9) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q9: The virtual curriculum affected your ability for self care") +
  xlab("") +
  coord_flip()

Question 10: Replacing clinical week experiences with virtual activities impacted the quality of your clinical skills education.

student_q10 <- data.frame(table(student[student$q10!='',]$q10))
student_q10 <- student_q10[-1,]
student_q10$Var1 <- factor(student_q10$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q10$prop <- student_q10$Freq/sum(student_q10$Freq)
student_q10$sd <- sqrt(student_q10$prop*(1-student_q10$prop)*student_q10$Freq)

ggplot(student_q10) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q10: Virtual activities impacted the quality of your clinical skills education") +
  xlab("") +
  coord_flip()

Question 11: COVID-19 affected the extent of your involvement in community service during pre-clinical years.

student_q11 <- data.frame(table(student[student$q11!='',]$q11))
student_q11 <- student_q11[-1,]
student_q11$Var1 <- factor(student_q11$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q11$prop <- student_q11$Freq/sum(student_q11$Freq)
student_q11$sd <- sqrt(student_q11$prop*(1-student_q11$prop)*student_q11$Freq)

ggplot(student_q11) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q11: COVID-19 affected the extent of your involvement in community service") +
  xlab("") +
  coord_flip()

Question 12: Your choice of specialty will be affected due to virtual versus in-person interactions with instructors, lack of shadowing and research opportunities.

student_q12 <- data.frame(table(student[student$q12!='',]$q12))
student_q12 <- student_q12[-1,]
student_q12$Var1 <- factor(student_q12$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q12$prop <- student_q12$Freq/sum(student_q12$Freq)
student_q12$sd <- sqrt(student_q12$prop*(1-student_q12$prop)*student_q12$Freq)

ggplot(student_q12) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q12: Your choice of specialty will be affected due to virtual \n versus in-person interactions with instructorsl") +
  xlab("") +
  coord_flip()

Question 13: You were able to connect to students in other class years and instructors to identify potential mentors using the Zoom format.

student_q13 <- data.frame(table(student[student$q13!='',]$q13))
student_q13 <- student_q13[-1,]
student_q13$Var1 <- factor(student_q13$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q13$prop <- student_q13$Freq/sum(student_q13$Freq)
student_q13$sd <- sqrt(student_q13$prop*(1-student_q13$prop)*student_q13$Freq)

ggplot(student_q13) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q13: You were able to connect to students in other class years \n and instructors to identify potential mentors using the Zoom format") +
  xlab("") +
  coord_flip()

Question 14: I feel confident that I belong at UNC School of Medicine.

student_q14 <- data.frame(table(student[student$q14!='',]$q14))
student_q14 <- student_q14[-1,]
student_q14$Var1 <- factor(student_q14$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

student_q14$prop <- student_q14$Freq/sum(student_q14$Freq)
student_q14$sd <- sqrt(student_q14$prop*(1-student_q14$prop)*student_q14$Freq)

ggplot(student_q14) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("q14: I feel confident that I belong at UNC School of Medicine") +
  xlab("") +
  coord_flip()

2.Word Maps for Open-ended Questions

In this section, I analyze the open-ended questions. For each question, I use a word map to show the most frequent words that the respondents mentioned in the survey. The larger the words are, the more important they are (or more frequently appeared).

Question 4: How interactive are lectures using the Zoom format?

# Retrieving the text data
student_q4 <- student$q4
docs <- Corpus(VectorSource(student_q4))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)
df <- df[-1,]

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Question 5: How was the small group experience using the Zoom format?

# Retrieving the text data
student_q5 <- student$q5
docs <- Corpus(VectorSource(student_q5))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)
df <- df[c(-1,-3,-4),]

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Question 15: If you did not feel a sense of belonging, please explain further

# Retrieving the text data
student_q15 <- student$q15
docs <- Corpus(VectorSource(student_q15))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

3. Dependency Table for Questions

This table shows the level of dependency between each pair of the multiple-choice questions. The numbers here are p-values. If the value is smaller than 0.05, it means that the we reject the null hypothesis that the two questions are independent. This indicates the two questions are correlated with each other.

student_cat <- apply(student[student$q1!="", c(3:5,8:16)], 2, as.character)
p_value <- matrix(nrow = 12, ncol = 12)

for (i in 1:11) {
  for (j in (i+1):12) {

    temp <- table(student_cat[,i], student_cat[,j])
    p_value[j,i] <- chisq.test(temp, simulate.p.value = TRUE)$p.value
    
  }
}

4. PCA Analysis

The PCA analysis can show potential clusters inside the survey results. From PCA scores, we can see that all questions have similar contribution to PCA1, except for question 3. The figure also show that all questions other than question 3 are driving the scores into one direction (to the upper right corner). Thus, there is no obvious clusters from the PCA analysis.

# change categorical variable to numeric
student_num <- student[c(5,8:16)]
levels(student_num$q3) <- c(0,3,4)
levels(student_num$q6) <- c(0,1,2,3,4,5)
levels(student_num$q7) <- c(5,4,3,2,1)
levels(student_num$q8) <- c(0,5,4,3,2,1)
levels(student_num$q9) <- c(0,5,4,3,2,1)
levels(student_num$q10) <- c(0,5,4,3,2,1)
levels(student_num$q11) <- c(0,5,4,3,2,1)
levels(student_num$q12) <- c(0,5,4,3,2,1)
levels(student_num$q13) <- c(0,5,4,3,2,1)
levels(student_num$q14) <- c(0,5,4,3,2,1)

student_num$q3 <- as.numeric(student_num$q3)
student_num$q6 <- as.numeric(student_num$q6)
student_num$q7 <- as.numeric(student_num$q7)
student_num$q8 <- as.numeric(student_num$q8)
student_num$q9 <- as.numeric(student_num$q9)
student_num$q10 <- as.numeric(student_num$q10)
student_num$q11 <- as.numeric(student_num$q11)
student_num$q12 <- as.numeric(student_num$q12)
student_num$q13 <- as.numeric(student_num$q13)
student_num$q14 <- as.numeric(student_num$q14)

# Compute the Principal Components
student_pca <- prcomp(student_num, center = TRUE,scale. = TRUE)
student_pca
## Standard deviations (1, .., p=10):
##  [1] 2.3251163 1.0237261 0.8703568 0.7721940 0.7566940 0.6392828 0.6095487
##  [8] 0.5509207 0.5484107 0.4846945
## 
## Rotation (n x k) = (10 x 10):
##           PC1         PC2          PC3        PC4         PC5         PC6
## q3  0.1300805 -0.87875785  0.063488635 -0.3473311  0.05732723 -0.03114513
## q6  0.3439932 -0.14644081  0.233113644  0.4235644  0.02426716 -0.27450239
## q7  0.3003324  0.24650549  0.318057744 -0.2906026 -0.66183689  0.17801382
## q8  0.3253917 -0.08656831 -0.531142482 -0.1831559 -0.18717526  0.45048975
## q9  0.3445623 -0.06287720 -0.098039630  0.3185741 -0.37305102 -0.30812860
## q10 0.3223695  0.21117543  0.005649359 -0.5896929  0.21598407 -0.34827891
## q11 0.3346888  0.26629253 -0.098627167 -0.1504568  0.38808260 -0.25614961
## q12 0.3598480 -0.12205262  0.176654285  0.1480359 -0.02243026 -0.11600739
## q13 0.3271367  0.03642792 -0.507362352  0.2775345  0.17813486  0.17055895
## q14 0.3122460  0.04895455  0.500359733  0.1056121  0.39294097  0.60385955
##              PC7          PC8         PC9        PC10
## q3  -0.008785297  0.184738234 -0.07415377 -0.20550766
## q6  -0.415483246  0.257530285 -0.12704888  0.54354847
## q7  -0.188994125  0.183294087 -0.28227877 -0.21144999
## q8   0.184126459 -0.044778469  0.02337552  0.54455309
## q9   0.391933374  0.149179242  0.55415109 -0.22172530
## q10 -0.317206472 -0.210485999  0.42908926  0.08199983
## q11  0.461215962  0.389990957 -0.44676003 -0.06386516
## q12  0.208188483 -0.799764452 -0.32157326 -0.04642661
## q13 -0.483133148 -0.008403488 -0.09171193 -0.50447091
## q14  0.118165967  0.074850235  0.30905677 -0.06679319
# plot pca
library(devtools)
## Loading required package: usethis
#install_github("vqv/ggbiplot")
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: grid
ggbiplot(student_pca) + coord_equal(ratio = 0.4)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Faculty Survey

I did the same analysis for the faculty survey as what I did for the student survey. Results can bbe found in the following section.

1. Bar graph with 95% confidence interval

Question 1: For your preclinical education, what is your preferred mode of teaching?

faculty_q1 <- data.frame(table(faculty[faculty$q1!='',]$q1))
faculty_q1$prop <- faculty_q1$Freq/sum(faculty_q1$Freq)
faculty_q1$sd <- sqrt(faculty_q1$prop*(1-faculty_q1$prop)*faculty_q1$Freq)

ggplot(faculty_q1) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q1: What is your preferred mode of teaching?") +
  xlab("") +
  coord_flip()

Question 6: How easy was it connecting with students in small groups over Zoom?

faculty_q6 <- data.frame(table(faculty[faculty$q6!='',]$q6))
faculty_q6$Var1 <- factor(faculty_q6$Var1, levels = c("Very easy", "Easy", 
                                                      "Neither difficult nor easy", "Difficult",
                                                      "Very difficult"))
faculty_q6$prop <- faculty_q6$Freq/sum(faculty_q6$Freq)
faculty_q6$sd <- sqrt(faculty_q6$prop*(1-faculty_q6$prop)*faculty_q6$Freq)

ggplot(faculty_q6) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q6: How easy was it connecting with students in small groups \n over Zoom?") +
  xlab("") +
  coord_flip()

Question 9: The virtual curriculum affected your ability for self care.

faculty_q9 <- data.frame(table(faculty[faculty$q9!='',]$q9))
faculty_q9 <- faculty_q9[-1,]
faculty_q9$Var1 <- factor(faculty_q9$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

faculty_q9$prop <- faculty_q9$Freq/sum(faculty_q9$Freq)
faculty_q9$sd <- sqrt(faculty_q9$prop*(1-faculty_q9$prop)*faculty_q9$Freq)

ggplot(faculty_q9) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q9: The virtual curriculum affected your ability for self care") +
  xlab("") +
  coord_flip()

Question 12: Your ability to help students explore your speciality was affected by virtual versus inperson interactions with students, lack of shadowing and research opportunities during COVID-19.

faculty_q12 <- data.frame(table(faculty[faculty$q12!='',]$q12))
faculty_q12 <- faculty_q12[-1,]
faculty_q12$Var1 <- factor(faculty_q12$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

faculty_q12$prop <- faculty_q12$Freq/sum(faculty_q12$Freq)
faculty_q12$sd <- sqrt(faculty_q12$prop*(1-faculty_q12$prop)*faculty_q12$Freq)

ggplot(faculty_q12) +
  geom_bar(aes(x=Var1, y=Freq), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar(aes(x=Var1, ymin=Freq-sd*1.96, ymax=Freq+sd*1.96), width=0.4, colour="orange") +
  ggtitle("Q12: The virtual curriculum affected your ability for self care") +
  xlab("") +
  coord_flip()

2.Word Maps for Open-ended Questions

Question 1 (if chose “Other”) How interactive are lectures using the Zoom format?

# Retrieving the text data
faculty_q1_other <- faculty$q1_other
docs <- Corpus(VectorSource(faculty_q1_other))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Question 4: How interactive are lectures using the Zoom format?

# Retrieving the text data
faculty_q4 <- faculty$q4
docs <- Corpus(VectorSource(faculty_q4))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)
df <- df[-1,]

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Question 12: If you felt that the ability to help students explore your specialty was affected, please explain further

# Retrieving the text data
faculty_q12_explain <- faculty$q12_explain
docs <- Corpus(VectorSource(faculty_q12_explain))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Question 16: Other comments

# Retrieving the text data
faculty_q16 <- faculty$q16
docs <- Corpus(VectorSource(faculty_q16))

# Clean the text data
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

# Create a document-term-matrix
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)

# Generate the word cloud
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, 
          rot.per=0.35, colors=brewer.pal(8, "Dark2"))

4. PCA Analysis

Similar to the student survey, the PCA analysis does not find any obvious clusters from the survey results. The sample size and the number of questions inside the faculty survey are both small, so it is hard to detect.

# change categorical variable to numeric
faculty_num <- faculty[c(5:7)]
levels(faculty_num$q6) <- c(0,1,2,3,4,5)
levels(faculty_num$q9) <- c(0,5,4,3,2,1)
levels(faculty_num$q12) <- c(0,5,4,3,2,1)

faculty_num$q6 <- as.numeric(faculty_num$q6)
faculty_num$q9 <- as.numeric(faculty_num$q9)
faculty_num$q12 <- as.numeric(faculty_num$q12)

# Compute the Principal Components
faculty_pca <- prcomp(faculty_num, center = TRUE,scale. = TRUE)
faculty_pca 
## Standard deviations (1, .., p=3):
## [1] 1.1746540 0.9904341 0.7995176
## 
## Rotation (n x k) = (3 x 3):
##            PC1         PC2        PC3
## q6  -0.6971962  0.04798775  0.7152725
## q9  -0.3158145 -0.91627945 -0.2463599
## q12 -0.6435672  0.39765456 -0.6539817
# plot pca
library(ggbiplot)
ggbiplot(faculty_pca) + coord_equal(ratio = 0.4)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Dependency between the Two Surveys

In this section, I use different methods to analyze the relationship between the student and faculty surveys. We can know whether students and faculty have similar opinions on how pandemic influenced their academic activities and level of wellness.

1. Grouped Bar Plot on Common Questions

I first used a grouped bar plot with the 95% confidence interval to check if students and faculty members chose the same option for a specific question. For each option, I use bars with different colors to show the ratio of students or faculty members who chose it. If the confidence intervals do not overlap, it means the ratio of students who chose this option is significantly different from the ratio of faculty members who chose it.

Question 1

merge_q1 <- rbind(student_q1, faculty_q1[-4,])
merge_q1$category <- c(rep("student",4), rep("faculty",4))
merge_q1$sd <- sqrt(merge_q1$prop*(1-merge_q1$prop)/merge_q1$Freq)

ggplot(merge_q1, aes(fill=category, y=prop, x=Var1)) + 
    geom_bar(position="dodge", stat="identity") +
    geom_errorbar(aes(x=Var1, ymin=prop-sd*1.96, ymax=prop+sd*1.96),  position = position_dodge(0.95), width=0.4, colour="orange") +
    ggtitle("Q1: What is your preferred mode of teaching/leanring") +
    xlab("") + ylab("Proportion") +
    coord_flip()

Question 6: How easy was it connecting with faculty/students in small groups over Zoom?

merge_q6 <- rbind(student_q6, faculty_q6)
de<-data.frame("Very difficult",0,0,0)
names(de)<-c("Var1","Freq", "prop", "sd")
merge_q6 <- rbind(merge_q6, de)
merge_q6$category <- c(rep("student",5), rep("faculty",5))
merge_q6$Var1 <- factor(merge_q6$Var1, levels = c("Very easy", "Easy", 
                                                      "Neither difficult nor easy", "Difficult",
                                                      "Very difficult"))
merge_q6$sd <- sqrt(merge_q6$prop*(1-merge_q6$prop)/merge_q6$Freq)

ggplot(merge_q6, aes(fill=category, y=prop, x=Var1)) + 
    geom_bar(position="dodge", stat="identity") +
    geom_errorbar(aes(x=Var1, ymin=prop-sd*1.96, ymax=prop+sd*1.96),  position = position_dodge(0.95), width=0.4, colour="orange") +
    ggtitle("Q6: How easy was it connecting with faculty/students in small groups over Zoom?") +
    xlab("") + ylab("Proportion") +
    coord_flip()

Question 9: The virtual curriculum affected your ability for self care.

merge_q9 <- rbind(student_q9, faculty_q9)
de<-data.frame("Agree",0,0,0)
names(de)<-c("Var1","Freq", "prop", "sd")
merge_q9 <- rbind(merge_q9, de)
merge_q9$Var1 <- factor(merge_q9$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

merge_q9$category <- c(rep("student",5), rep("faculty",5))
merge_q9$sd <- sqrt(merge_q9$prop*(1-merge_q9$prop)/merge_q9$Freq)

ggplot(merge_q9, aes(fill=category, y=prop, x=Var1)) + 
    geom_bar(position="dodge", stat="identity") +
    geom_errorbar(aes(x=Var1, ymin=prop-sd*1.96, ymax=prop+sd*1.96),  position = position_dodge(0.95), width=0.4, colour="orange") +
    ggtitle("Q9: The virtual curriculum affected your ability for self care") +
    xlab("") + ylab("Proportion") +
    coord_flip()

Question 12: Your choice of specialt/your ability to help students explore your speciality was affected by virtual versus inperson interactions with instructors/students, lack of shadowing and research opportunities during COVID-19.

merge_q12 <- rbind(student_q12, faculty_q12)
de<-data.frame("Agree",0,0,0)
names(de)<-c("Var1","Freq", "prop", "sd")
merge_q12 <- rbind(merge_q12, de)
merge_q12$Var1 <- factor(merge_q12$Var1, levels = c("Strongly agree", "Agree", "Neutral", 
                                                      "Disagree", "Strongly disagree"))

merge_q12$category <- c(rep("student",5), rep("faculty",5))
merge_q12$sd <- sqrt(merge_q12$prop*(1-merge_q12$prop)/merge_q12$Freq)

ggplot(merge_q12, aes(fill=category, y=prop, x=Var1)) + 
    geom_bar(position="dodge", stat="identity") +
    geom_errorbar(aes(x=Var1, ymin=prop-sd*1.96, ymax=prop+sd*1.96),  position = position_dodge(0.95), width=0.4, colour="orange") +
    ggtitle("Q12: Speciality was affected by virtual versus inperson interactions") +
    xlab("") + ylab("Proportion") +
    coord_flip()

2. Hypothesis Testing: Do the answers from students and faculty members differ?

Then I use hypothesis testing to check the mean difference of each questions between students and faculty members. I first transfer the answers of each multiple-choice question into numbers, and then calculate the average value of each question. Finally, I can use hypothesis testing to detect any the differences between student and faculty surveys. If the p-value in a test is smaller than 0.05, it shows that the mean value of a question in student survey is significantly different from the faculty survey.

2-1: Non parametric test of mean difference

Question 1

# get common questions in student survey that are categorical & change it to numeric
student_common <- student[,c(3,8,11,14)]
levels(student_common$q1) <- c(0,1,2,3,4)
levels(student_common$q6) <- c(0,1,2,3,4,5)
levels(student_common$q9) <- c(0,5,4,3,2,1)
levels(student_common$q12) <- c(0,5,4,3,2,1)

student_common$q1 <- as.numeric(student_common$q1)
student_common$q6 <- as.numeric(student_common$q6)
student_common$q9 <- as.numeric(student_common$q9)
student_common$q12 <- as.numeric(student_common$q12)

# get common questions in faculty survey that are categorical & change it to numeric
faculty_common <- faculty[,c(2,5:7)]
levels(faculty_common$q1) <- c(1,2,3,0,4)
levels(faculty_common$q6) <- c(0,1,2,3,4,5)
levels(faculty_common$q9) <- c(0,5,4,3,2,1)
levels(faculty_common$q12) <- c(0,5,4,3,2,1)

faculty_common$q1 <- as.numeric(faculty_common$q1)
faculty_common$q6 <- as.numeric(faculty_common$q6)
faculty_common$q9 <- as.numeric(faculty_common$q9)
faculty_common$q12 <- as.numeric(faculty_common$q12)

# test for question 1
wilcox.test(student_common$q1, faculty_common$q1, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  student_common$q1 and faculty_common$q1
## W = 2134, p-value = 1.116e-07
## alternative hypothesis: true location shift is not equal to 0

Question 6

wilcox.test(student_common$q6, faculty_common$q6, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  student_common$q6 and faculty_common$q6
## W = 1542, p-value = 0.1259
## alternative hypothesis: true location shift is not equal to 0

Question 9

wilcox.test(student_common$q9, faculty_common$q9, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  student_common$q9 and faculty_common$q9
## W = 1405, p-value = 0.5257
## alternative hypothesis: true location shift is not equal to 0

Question 12

wilcox.test(student_common$q12, faculty_common$q12, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  student_common$q12 and faculty_common$q12
## W = 1268.5, p-value = 0.8176
## alternative hypothesis: true location shift is not equal to 0